Loading Libraries
require("readxl")
## Loading required package: readxl
## Warning: package 'readxl' was built under R version 3.5.3
library(readxl)
require("dplyr")
## Loading required package: dplyr
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(dplyr)
require("tidyr")
## Loading required package: tidyr
## Warning: package 'tidyr' was built under R version 3.5.3
library(tidyr)
require("rfm")
## Loading required package: rfm
## Warning: package 'rfm' was built under R version 3.5.3
library(rfm)
require("data.table")
## Loading required package: data.table
## Warning: package 'data.table' was built under R version 3.5.3
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
library(data.table)
Importing data
cust_agg_tran <- read_excel("cust agg transaction data.xlsx")
head(cust_agg_tran)
## # A tibble: 6 x 9
## customer_Number billCount item_Count MaxTransactionDate
## <dbl> <dbl> <dbl> <dttm>
## 1 1 1 1 2017-02-11 00:00:00
## 2 2 1 4 2017-02-10 00:00:00
## 3 3 1 1 2017-02-14 00:00:00
## 4 4 1 2 2017-02-20 00:00:00
## 5 5 1 2 2017-02-28 00:00:00
## 6 6 1 1 2017-02-15 00:00:00
## # ... with 5 more variables: MinTransactionDate <dttm>, MaxAmount <dbl>,
## # TotalAmount <dbl>, sale_Amount <dbl>, sale_Percent <dbl>
reference_file <- read_excel("Reference file.xlsx")
head(reference_file)
## # A tibble: 6 x 12
## MemberCardNumber Customer_Group Revenue Invoice Nationality_Gro~
## <dbl> <chr> <dbl> <dbl> <chr>
## 1 1 2.NEWLY ACTIVE 4.25 1 REMAINING
## 2 2 2.NEWLY ACTIVE 12.5 1 REMAINING
## 3 3 2.NEWLY ACTIVE 4.1 1 REMAINING
## 4 4 2.NEWLY ACTIVE 6.25 1 REMAINING
## 5 5 2.NEWLY ACTIVE 32.4 1 REMAINING
## 6 6 2.NEWLY ACTIVE 4.7 1 REMAINING
## # ... with 7 more variables: NationalityClassification <chr>,
## # Stateclassification <chr>, AgeClassification <chr>, RevenueBand <chr>,
## # InvoiceBand <chr>, OUTLIEROVERALL <chr>, Sampling <chr>
Combining data of reference and cust_agg_tran in one
orignalData <- cbind(cust_agg_tran[1:49811,], reference_file)
orignalData<-
orignalData %>% separate(Customer_Group, into=c("Customer_Group_Num", "Customer_Group"), sep= "\\.") %>%
separate(RevenueBand, into=c("RevenueBand_Num", "RevenueBand"), sep= "\\.")%>%
separate(InvoiceBand, into=c("InvoiceBand_Num", "InvoiceBand"), sep= "\\.") %>%
separate(Sampling, into=c("Sampling_Num", "Sampling"), sep= "\\.")
head(orignalData)
## customer_Number billCount item_Count MaxTransactionDate
## 1 1 1 1 2017-02-11
## 2 2 1 4 2017-02-10
## 3 3 1 1 2017-02-14
## 4 4 1 2 2017-02-20
## 5 5 1 2 2017-02-28
## 6 6 1 1 2017-02-15
## MinTransactionDate MaxAmount TotalAmount sale_Amount sale_Percent
## 1 2017-02-11 44 44 0 0
## 2 2017-02-10 45 155 155 1
## 3 2017-02-14 25 25 25 1
## 4 2017-02-20 25 45 45 1
## 5 2017-02-28 25 50 50 1
## 6 2017-02-15 35 35 35 1
## MemberCardNumber Customer_Group_Num Customer_Group Revenue Invoice
## 1 1 2 NEWLY ACTIVE 4.25 1
## 2 2 2 NEWLY ACTIVE 12.50 1
## 3 3 2 NEWLY ACTIVE 4.10 1
## 4 4 2 NEWLY ACTIVE 6.25 1
## 5 5 2 NEWLY ACTIVE 32.41 1
## 6 6 2 NEWLY ACTIVE 4.70 1
## Nationality_Group NationalityClassification Stateclassification
## 1 REMAINING REMAINING NONE
## 2 REMAINING REMAINING NONE
## 3 REMAINING REMAINING NONE
## 4 REMAINING REMAINING NONE
## 5 REMAINING REMAINING NONE
## 6 REMAINING REMAINING NONE
## AgeClassification RevenueBand_Num RevenueBand InvoiceBand_Num
## 1 REMAINING 1 Below 25 1
## 2 REMAINING 3 50-75 1
## 3 REMAINING 1 Below 25 1
## 4 REMAINING 2 25-50 1
## 5 REMAINING 4 Above 75 1
## 6 REMAINING 1 Below 25 1
## InvoiceBand OUTLIEROVERALL Sampling_Num Sampling
## 1 Below 25 N 1 TG
## 2 Below 25 N 1 TG
## 3 Below 25 N 1 TG
## 4 Below 25 N 1 TG
## 5 Below 25 N 1 TG
## 6 Below 25 N 1 TG
STEP 1
#From the Transaction Agg file,
#select customers classified as "Active"
#under the column Customer Group the reference file
active_Customers<- filter(orignalData, Customer_Group=="ACTIVE")
head(active_Customers)
## customer_Number billCount item_Count MaxTransactionDate
## 1 22 1 2 2017-02-10
## 2 23 1 3 2017-02-08
## 3 25 1 2 2017-02-20
## 4 26 1 2 2017-02-17
## 5 27 1 1 2017-02-08
## 6 29 1 4 2017-02-10
## MinTransactionDate MaxAmount TotalAmount sale_Amount sale_Percent
## 1 2017-02-10 30 60 60 1.0000
## 2 2017-02-08 40 119 80 0.6722
## 3 2017-02-20 65 114 0 0.0000
## 4 2017-02-17 25 30 5 0.1666
## 5 2017-02-08 35 35 35 1.0000
## 6 2017-02-10 40 85 85 1.0000
## MemberCardNumber Customer_Group_Num Customer_Group Revenue Invoice
## 1 22 1 ACTIVE 28.15 1
## 2 23 1 ACTIVE 11.05 1
## 3 25 1 ACTIVE 9.80 1
## 4 26 1 ACTIVE 13.10 1
## 5 27 1 ACTIVE 16.20 5
## 6 29 1 ACTIVE 30.30 1
## Nationality_Group NationalityClassification Stateclassification
## 1 KUWAIT KUWAIT NONE
## 2 KUWAIT KUWAIT NONE
## 3 KUWAIT KUWAIT NONE
## 4 Indian Sub Continent INDIAN SUB CONTINENT NONE
## 5 KUWAIT KUWAIT NONE
## 6 KUWAIT KUWAIT NONE
## AgeClassification RevenueBand_Num RevenueBand InvoiceBand_Num
## 1 REMAINING 2 25-50 1
## 2 18-25 and 26-33 1 Below 25 1
## 3 18-25 and 26-33 1 Below 25 1
## 4 REMAINING 2 25-50 1
## 5 REMAINING 2 25-50 3
## 6 REMAINING 2 25-50 1
## InvoiceBand OUTLIEROVERALL Sampling_Num Sampling
## 1 Below 25 N 1 TG
## 2 Below 25 N 1 TG
## 3 Below 25 N 1 TG
## 4 Below 25 N 1 TG
## 5 50-75 N 1 TG
## 6 Below 25 N 1 TG
STEP 2
#Further filter customers in step 1 for Sampling = "TG"
act_TG_Customers <- filter(active_Customers, Sampling=="TG")
DT::datatable(act_TG_Customers)
## Warning in instance$preRenderHook(instance): It seems your data is too
## big for client-side DataTables. You may consider server-side processing:
## https://rstudio.github.io/DT/server.html
#STEP 3
#From step 2, Select the customers
#whose Avg Transaction Value (Amount per bill) is
#above KD 25 in the transaction file
act_TG_Customers$AmountPerBill <- act_TG_Customers$TotalAmount/act_TG_Customers$billCount
act_TG_KD25_Customers <- filter(act_TG_Customers, AmountPerBill> 25.0)
head(act_TG_KD25_Customers)
## customer_Number billCount item_Count MaxTransactionDate
## 1 22 1 2 2017-02-10
## 2 23 1 3 2017-02-08
## 3 25 1 2 2017-02-20
## 4 26 1 2 2017-02-17
## 5 27 1 1 2017-02-08
## 6 29 1 4 2017-02-10
## MinTransactionDate MaxAmount TotalAmount sale_Amount sale_Percent
## 1 2017-02-10 30 60 60 1.0000
## 2 2017-02-08 40 119 80 0.6722
## 3 2017-02-20 65 114 0 0.0000
## 4 2017-02-17 25 30 5 0.1666
## 5 2017-02-08 35 35 35 1.0000
## 6 2017-02-10 40 85 85 1.0000
## MemberCardNumber Customer_Group_Num Customer_Group Revenue Invoice
## 1 22 1 ACTIVE 28.15 1
## 2 23 1 ACTIVE 11.05 1
## 3 25 1 ACTIVE 9.80 1
## 4 26 1 ACTIVE 13.10 1
## 5 27 1 ACTIVE 16.20 5
## 6 29 1 ACTIVE 30.30 1
## Nationality_Group NationalityClassification Stateclassification
## 1 KUWAIT KUWAIT NONE
## 2 KUWAIT KUWAIT NONE
## 3 KUWAIT KUWAIT NONE
## 4 Indian Sub Continent INDIAN SUB CONTINENT NONE
## 5 KUWAIT KUWAIT NONE
## 6 KUWAIT KUWAIT NONE
## AgeClassification RevenueBand_Num RevenueBand InvoiceBand_Num
## 1 REMAINING 2 25-50 1
## 2 18-25 and 26-33 1 Below 25 1
## 3 18-25 and 26-33 1 Below 25 1
## 4 REMAINING 2 25-50 1
## 5 REMAINING 2 25-50 3
## 6 REMAINING 2 25-50 1
## InvoiceBand OUTLIEROVERALL Sampling_Num Sampling AmountPerBill
## 1 Below 25 N 1 TG 60
## 2 Below 25 N 1 TG 119
## 3 Below 25 N 1 TG 114
## 4 Below 25 N 1 TG 30
## 5 50-75 N 1 TG 35
## 6 Below 25 N 1 TG 85
STEP 4
#From step 3, Identify the customers for targeting
#using Recency, Frequency and Monetary analysis (google it if not known).
#Use max transaction date, bill # and Amount respectively for the 3 fields.
#Select the best 50% customers
todaydate <- Sys.Date()
RFM_act_TG_KD25 <- data.frame(customer_id = act_TG_KD25_Customers$customer_Number,
revenue = act_TG_KD25_Customers$TotalAmount,
most_recent_visit = act_TG_KD25_Customers$MaxTransactionDate,
number_of_orders = act_TG_KD25_Customers$billCount,
recency_days = difftime(todaydate, act_TG_KD25_Customers$MaxTransactionDate)
)
head(RFM_act_TG_KD25)
## customer_id revenue most_recent_visit number_of_orders recency_days
## 1 22 60 2017-02-10 1 763 days
## 2 23 119 2017-02-08 1 765 days
## 3 25 114 2017-02-20 1 753 days
## 4 26 30 2017-02-17 1 756 days
## 5 27 35 2017-02-08 1 765 days
## 6 29 85 2017-02-10 1 763 days
analysis_date <- max(act_TG_KD25_Customers$MaxTransactionDate)
### Using "rfm_table_customer()" function from rfm package
rfm_result <- rfm_table_customer(RFM_act_TG_KD25, customer_id, number_of_orders, recency_days, revenue, analysis_date)
rfm_data1<- data.frame(rfm_result$rfm)
rfm_data1<-arrange(rfm_data1, desc(rfm_data1$rfm_score))
head(rfm_data1)
## customer_id recency_days transaction_count amount recency_score
## 1 123 755 days 4 208.00 5
## 2 127 752 days 4 458.00 5
## 3 212 749 days 3 749.25 5
## 4 265 766 days 3 793.00 5
## 5 324 762 days 5 609.50 5
## 6 328 745 days 3 280.25 5
## frequency_score monetary_score rfm_score
## 1 5 5 555
## 2 5 5 555
## 3 5 5 555
## 4 5 5 555
## 5 5 5 555
## 6 5 5 555
############ TOP 50 % Data #############
rfm_top50percent_data_TG <- rfm_data1[1:(nrow(rfm_data1)/2),]
head(rfm_top50percent_data_TG)
## customer_id recency_days transaction_count amount recency_score
## 1 123 755 days 4 208.00 5
## 2 127 752 days 4 458.00 5
## 3 212 749 days 3 749.25 5
## 4 265 766 days 3 793.00 5
## 5 324 762 days 5 609.50 5
## 6 328 745 days 3 280.25 5
## frequency_score monetary_score rfm_score
## 1 5 5 555
## 2 5 5 555
## 3 5 5 555
## 4 5 5 555
## 5 5 5 555
## 6 5 5 555
#######################################
########### VISUALIZATONS FOR RFM ANALYSIS
#The heat map shows the average monetary value for different categories of recency and frequency scores.
rfm_heatmap(rfm_result)

#to generate the distribution of monetary scores for the different combinations of frequency and recency scores.
rfm_bar_chart(rfm_result)

# To examine the relative distribution of monetary value (total revenue generated by each customer)
#recency days (days since the most recent visit for each customer)
#frequency (transaction count for each customer)
rfm_histograms(rfm_result)
## Warning: attributes are not identical across measure variables;
## they will be dropped

#Visualize the distribution of customers across orders.
rfm_order_dist(rfm_result)

#Recency vs Monetary Value
rfm_rm_plot(rfm_result)
## Don't know how to automatically pick scale for object of type difftime. Defaulting to continuous.

#Frequency vs Monetary Value
rfm_fm_plot(rfm_result)

#Recency vs Frequency
rfm_rf_plot(rfm_result)
## Don't know how to automatically pick scale for object of type difftime. Defaulting to continuous.

############# RFM Calculation with out using the RFM package ######################
# customer_Number<- act_tg_kd25_cust_agg_tran$customer_Number
# recent_Transanction <- act_tg_kd25_cust_agg_tran$MaxTransactionDate
# bill_Count <- act_tg_kd25_cust_agg_tran$billCount
# total_Amount <- act_tg_kd25_cust_agg_tran$TotalAmount
#
# rfm <- data.frame(customer_Number, recent_Transanction, bill_Count, total_Amount)
# head(rfm)
#
# todaydate <- Sys.Date()
#
# rfm <- mutate(rfm, recency = difftime(todaydate, rfm$recent_Transanction))
# head(rfm)
# rfm <- mutate(rfm, frequency = rfm$bill_Count)
# rfm$bill_Count <- as.numeric(rfm$bill_Count)
# rfm$frequency <- as.numeric(rfm$frequency)
# rfm$recency <- as.character(rfm$recency)
# rfm$recency <- as.numeric(rfm$recency)
# rfm$monetary <- rfm$total_Amount / rfm$bill_Count
# summary(rfm)
#
# head(rfm)
#
# df<-rfm
#
# #start function for assigining score 1- lowest importance, 5- highest importance
# scoring <- function (df,column,r=5){
#
# #get the length of rows of df
# len <- dim(df)[1]
#
# score <- rep(0,times=len)
#
# # get the quantity of rows per 1/r e.g. 1/5
# nr <- round(len / r)
# if (nr > 0){
#
# # seperate the rows by r aliquots
# rStart <-0
# rEnd <- 0
# for (i in 1:r){
#
# #set the start row number and end row number
# rStart = rEnd+1
#
# #skip one "i" if the rStart is already in the i+1 or i+2 or ...scope.
# if (rStart> i*nr) next
#
# if (i == r){
# if(rStart<=len ) rEnd <- len else next
# }else{
# rEnd <- i*nr
# }
#
# # set the Recency score
# score[rStart:rEnd]<- r-i+1
#
# # make sure the customer who have the same recency have the same score
# s <- rEnd+1
# if(i<r & s <= len){
# for(u in s: len){
# if(df[rEnd,column]==df[u,column]){
# score[u]<- r-i+1
# rEnd <- u
# }else{
# break;
# }
# }
#
# }
#
# }
#
# }
# return(score)
#
# }
#
# #end of function Scoring
# head(df)
#
# df<-df %>% arrange(df$recency)
# R_Score <- scoring(df,"recency",r=5)
# df <- cbind(df, R_Score)
# head(df)
#
# df <- df%>% arrange(desc(df$frequency))
# head(df)
# F_Score <- scoring(df,"frequency",5)
# df <- cbind(df, F_Score)
# head(df)
#
#
# df <- df%>% arrange(desc(df$monetary))
#
# M_Score <- scoring(df,"monetary",5)
# df <- cbind(df, M_Score)
# head(df)
# #order the dataframe by R_Score, F_Score, and M_Score desc
# df <- arrange(df, desc(R_Score), desc(F_Score), desc(M_Score))
# head(df)
#
# # caculate the total score
# Total_Score <- c(100*df$R_Score + 10*df$F_Score+df$M_Score)
#
# df <- cbind(df,Total_Score)
# glimpse(df)
#
#
# # get best 50% from df
# x<-length(df[,1])/2
# df1<- df[1:x, ]
# glimpse(df1)
# str(df1)
#
##########################################################################################
STEP 5
#Select RG as a sample from the reference file
#such that the RG selected represents the selected customers in step 4
# (use unpaired homoskedastic t-test) such that the ratio of RG: TG is 1:10.
active_rg_KD25_data <- filter(active_Customers, Sampling=="RG"
& (TotalAmount/billCount)>25.0)
head(active_rg_KD25_data)
## customer_Number billCount item_Count MaxTransactionDate
## 1 73 1 1 2017-02-18
## 2 141 1 1 2016-12-10
## 3 207 1 3 2017-01-02
## 4 213 4 16 2017-02-13
## 5 230 1 3 2016-12-16
## 6 288 1 2 2017-01-21
## MinTransactionDate MaxAmount TotalAmount sale_Amount sale_Percent
## 1 2017-02-18 35.00 35.00 0.00 0.0000
## 2 2016-12-10 27.75 27.75 27.75 1.0000
## 3 2017-01-02 39.00 57.00 57.00 1.0000
## 4 2016-12-06 87.00 664.25 416.25 0.6266
## 5 2016-12-16 59.00 94.00 35.00 0.3723
## 6 2017-01-21 35.00 50.00 50.00 1.0000
## MemberCardNumber Customer_Group_Num Customer_Group Revenue Invoice
## 1 73 1 ACTIVE 57.55 4
## 2 141 1 ACTIVE 14.20 1
## 3 207 1 ACTIVE 240.76 6
## 4 213 1 ACTIVE 17.35 4
## 5 230 1 ACTIVE 30.70 3
## 6 288 1 ACTIVE 204.00 12
## Nationality_Group NationalityClassification Stateclassification
## 1 KUWAIT KUWAIT NONE
## 2 KUWAIT KUWAIT NONE
## 3 KUWAIT KUWAIT NONE
## 4 KUWAIT KUWAIT NONE
## 5 KUWAIT KUWAIT NONE
## 6 KUWAIT KUWAIT NONE
## AgeClassification RevenueBand_Num RevenueBand InvoiceBand_Num
## 1 REMAINING 3 50-75 3
## 2 REMAINING 2 25-50 1
## 3 18-25 and 26-33 4 Above 75 4
## 4 34-42 and 43-50 2 25-50 3
## 5 REMAINING 2 25-50 2
## 6 18-25 and 26-33 4 Above 75 4
## InvoiceBand OUTLIEROVERALL Sampling_Num Sampling
## 1 50-75 N 2 RG
## 2 Below 25 N 2 RG
## 3 Above 75 N 2 RG
## 4 50-75 N 2 RG
## 5 25-50 N 2 RG
## 6 Above 75 N 2 RG
todaydate <- Sys.Date()
RFM_act_RG_KD25 <- data.frame(customer_id = active_rg_KD25_data$customer_Number,
revenue = active_rg_KD25_data$TotalAmount,
most_recent_visit = active_rg_KD25_data$MaxTransactionDate,
number_of_orders = active_rg_KD25_data$billCount,
recency_days = difftime(todaydate, active_rg_KD25_data$MaxTransactionDate)
)
DT::datatable(RFM_act_RG_KD25)
analysis_date <- max(active_rg_KD25_data$MaxTransactionDate)
### Using "rfm_table_customer()" function from rfm package
rfm_result_RG <- rfm_table_customer(RFM_act_RG_KD25, customer_id, number_of_orders,recency_days, revenue, analysis_date)
########### VISUALIZATONS FOR RFM ANALYSIS
#The heat map shows the average monetary value for different categories of recency and frequency scores.
rfm_heatmap(rfm_result)

#to generate the distribution of monetary scores for the different combinations of frequency and recency scores.
rfm_bar_chart(rfm_result)

# To examine the relative distribution of monetary value (total revenue generated by each customer)
#recency days (days since the most recent visit for each customer)
#frequency (transaction count for each customer)
rfm_histograms(rfm_result)
## Warning: attributes are not identical across measure variables;
## they will be dropped

#Visualize the distribution of customers across orders.
rfm_order_dist(rfm_result)

#Recency vs Monetary Value
rfm_rm_plot(rfm_result)
## Don't know how to automatically pick scale for object of type difftime. Defaulting to continuous.

#Frequency vs Monetary Value
rfm_fm_plot(rfm_result)

#Recency vs Frequency
rfm_rf_plot(rfm_result)
## Don't know how to automatically pick scale for object of type difftime. Defaulting to continuous.

############
rfm_data2<- data.frame(rfm_result_RG$rfm)
head(rfm_data2)
## customer_id recency_days transaction_count amount recency_score
## 1 73 755 days 1 35.00 5
## 2 141 825 days 1 27.75 1
## 3 207 802 days 1 57.00 3
## 4 213 760 days 4 664.25 5
## 5 230 819 days 1 94.00 2
## 6 288 783 days 1 50.00 4
## frequency_score monetary_score rfm_score
## 1 1 1 511
## 2 1 1 111
## 3 1 2 312
## 4 5 5 555
## 5 1 3 213
## 6 1 2 412
rfm_data2<-arrange(rfm_data2, desc(rfm_data2$rfm_score))
head(rfm_data2)
## customer_id recency_days transaction_count amount recency_score
## 1 213 760 days 4 664.25 5
## 2 434 749 days 3 1578.50 5
## 3 692 756 days 3 220.50 5
## 4 840 763 days 6 231.25 5
## 5 2001 759 days 5 637.00 5
## 6 3136 745 days 3 281.00 5
## frequency_score monetary_score rfm_score
## 1 5 5 555
## 2 5 5 555
## 3 5 5 555
## 4 5 5 555
## 5 5 5 555
## 6 5 5 555
rfm_top50percent_data_RG <- rfm_data2[1:(nrow(rfm_data2)/2),]
head(rfm_top50percent_data_RG)
## customer_id recency_days transaction_count amount recency_score
## 1 213 760 days 4 664.25 5
## 2 434 749 days 3 1578.50 5
## 3 692 756 days 3 220.50 5
## 4 840 763 days 6 231.25 5
## 5 2001 759 days 5 637.00 5
## 6 3136 745 days 3 281.00 5
## frequency_score monetary_score rfm_score
## 1 5 5 555
## 2 5 5 555
## 3 5 5 555
## 4 5 5 555
## 5 5 5 555
## 6 5 5 555
head(rfm_top50percent_data_TG)
## customer_id recency_days transaction_count amount recency_score
## 1 123 755 days 4 208.00 5
## 2 127 752 days 4 458.00 5
## 3 212 749 days 3 749.25 5
## 4 265 766 days 3 793.00 5
## 5 324 762 days 5 609.50 5
## 6 328 745 days 3 280.25 5
## frequency_score monetary_score rfm_score
## 1 5 5 555
## 2 5 5 555
## 3 5 5 555
## 4 5 5 555
## 5 5 5 555
## 6 5 5 555
nrow(rfm_top50percent_data_RG)
## [1] 1118
nrow(rfm_top50percent_data_TG)
## [1] 12745
rg_data <- orignalData %>% select(customer_Number, RevenueBand,TotalAmount,Revenue) %>%
filter(customer_Number %in% rfm_top50percent_data_RG$customer_id)
tg_data <- orignalData %>% select(customer_Number, RevenueBand,TotalAmount, Revenue) %>%
filter( customer_Number %in% rfm_top50percent_data_TG$customer_id)
summary(rg_data)
## customer_Number RevenueBand TotalAmount Revenue
## Min. : 73 Length:1118 Min. : 25.50 Min. : 0.50
## 1st Qu.:12181 Class :character 1st Qu.: 66.81 1st Qu.: 18.61
## Median :25230 Mode :character Median : 122.75 Median : 48.02
## Mean :24553 Mean : 168.21 Mean : 91.05
## 3rd Qu.:36304 3rd Qu.: 212.00 3rd Qu.: 110.28
## Max. :49809 Max. :1578.50 Max. :1969.96
summary(tg_data)
## customer_Number RevenueBand TotalAmount Revenue
## Min. : 22 Length:12745 Min. : 25.5 Min. : 0.20
## 1st Qu.:12642 Class :character 1st Qu.: 62.0 1st Qu.: 18.98
## Median :25294 Mode :character Median : 116.8 Median : 50.20
## Mean :24923 Mean : 161.6 Mean : 89.77
## 3rd Qu.:37146 3rd Qu.: 202.0 3rd Qu.: 114.30
## Max. :49807 Max. :2360.5 Max. :1937.00
table(rg_data$RevenueBand)
##
## 25-50 50-75 Above 75 Below 25
## 245 295 366 212
table(tg_data$RevenueBand)
##
## 25-50 50-75 Above 75 Below 25
## 2827 3520 4176 2222
rg_25to50 <- filter(rg_data, rg_data$RevenueBand == "25-50")
rg_50to75 <- filter(rg_data, rg_data$RevenueBand == "50-75")
rg_Above75 <- filter(rg_data, rg_data$RevenueBand == "Above 75")
rg_Below25 <- filter(rg_data, rg_data$RevenueBand == "Below 25")
tg_25to50 <- filter(tg_data, tg_data$RevenueBand == "25-50")
tg_50to75 <- filter(tg_data, tg_data$RevenueBand == "50-75")
tg_Above75 <- filter(tg_data, tg_data$RevenueBand == "Above 75")
tg_Below25 <- filter(tg_data, tg_data$RevenueBand == "Below 25")
set.seed(420)
tg_25to50_sample = sample_n(tg_25to50, 2450)
set.seed(420)
tg_50to75_sample = sample_n(tg_50to75, 2950)
set.seed(420)
tg_Above75_sample = sample_n(tg_Above75, 3660)
set.seed(420)
tg_Below25_sample = sample_n(tg_Below25, 2120)
tg_data_sample <- rbind(tg_25to50_sample,tg_50to75_sample,tg_Above75_sample,tg_Below25_sample)
t-test
############################## t- test #############################
t.test(tg_data_sample$TotalAmount,rg_data$TotalAmount)
##
## Welch Two Sample t-test
##
## data: tg_data_sample$TotalAmount and rg_data$TotalAmount
## t = -1.3659, df = 1339.9, p-value = 0.1722
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -16.444178 2.944321
## sample estimates:
## mean of x mean of y
## 161.4574 168.2073
STEP 7
################################ Step 7 #######################
rg<- table(rg_data$RevenueBand)
tg<-table(tg_data$RevenueBand)
rg<-transform(rg)
tg<-transform(tg)
rg_tg<-cbind(rg,tg)
names(rg_tg)[1]<-"Revenue_Band"
names(rg_tg)[2]<- "RG_total_customers"
names(rg_tg)[4]<- "TG_total_customers"
rg_tg <- rg_tg[,-3]
rg_tg$TG_AvgAmountPerMember <- c(sum(tg_25to50$Revenue)/nrow(tg_25to50),
sum(tg_50to75$Revenue)/nrow(tg_50to75),
sum(tg_Above75$Revenue)/nrow(tg_Above75),
sum(tg_Below25$Revenue)/nrow(tg_Below25))
rg_tg$RG_AvgAmountPerMember <- c(sum(rg_25to50$Revenue)/nrow(rg_25to50),
sum(rg_50to75$Revenue)/nrow(rg_50to75),
sum(rg_Above75$Revenue)/nrow(rg_Above75),
sum(rg_Below25$Revenue)/nrow(rg_Below25))
DT::datatable(rg_tg)